In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
# from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.over_sampling import SMOTE
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
In [2]:
data=pd.read_csv('mbti.csv')
data.head()
Out[2]:
type posts
0 INFJ 'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1 ENTP 'I'm finding the lack of me in these posts ver...
2 INTP 'Good one _____ https://www.youtube.com/wat...
3 INTJ 'Dear INTP, I enjoyed our conversation the o...
4 ENTJ 'You're fired.|||That's another silly misconce...
In [3]:
data.describe(include='all')
Out[3]:
type posts
count 8675 8675
unique 16 8675
top INFP 'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
freq 1832 1
In [4]:
data.describe()
Out[4]:
type posts
count 8675 8675
unique 16 8675
top INFP 'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
freq 1832 1
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB
In [6]:
data.posts[0]
Out[6]:
"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...|||Welcome and stuff.|||http://playeressence.com/wp-content/uploads/2013/08/RED-red-the-pokemon-master-32560474-450-338.jpg  Game. Set. Match.|||Prozac, wellbrutin, at least thirty minutes of moving your legs (and I don't mean moving them while sitting in your same desk chair), weed in moderation (maybe try edibles as a healthier alternative...|||Basically come up with three items you've determined that each type (or whichever types you want to do) would more than likely use, given each types' cognitive functions and whatnot, when left by...|||All things in moderation.  Sims is indeed a video game, and a good one at that. Note: a good one at that is somewhat subjective in that I am not completely promoting the death of any given Sim...|||Dear ENFP:  What were your favorite video games growing up and what are your now, current favorite video games? :cool:|||https://www.youtube.com/watch?v=QyPqT8umzmY|||It appears to be too late. :sad:|||There's someone out there for everyone.|||Wait... I thought confidence was a good thing.|||I just cherish the time of solitude b/c i revel within my inner world more whereas most other time i'd be workin... just enjoy the me time while you can. Don't worry, people will always be around to...|||Yo entp ladies... if you're into a complimentary personality,well, hey.|||... when your main social outlet is xbox live conversations and even then you verbally fatigue quickly.|||http://www.youtube.com/watch?v=gDhy7rdfm14  I really dig the part from 1:46 to 2:50|||http://www.youtube.com/watch?v=msqXffgh7b8|||Banned because this thread requires it of me.|||Get high in backyard, roast and eat marshmellows in backyard while conversing over something intellectual, followed by massages and kisses.|||http://www.youtube.com/watch?v=Mw7eoU3BMbE|||http://www.youtube.com/watch?v=4V2uYORhQOk|||http://www.youtube.com/watch?v=SlVmgFQQ0TI|||Banned for too many b's in that sentence. How could you! Think of the B!|||Banned for watching movies in the corner with the dunces.|||Banned because Health class clearly taught you nothing about peer pressure.|||Banned for a whole host of reasons!|||http://www.youtube.com/watch?v=IRcrv41hgz4|||1) Two baby deer on left and right munching on a beetle in the middle.  2) Using their own blood, two cavemen diary today's latest happenings on their designated cave diary wall.  3) I see it as...|||a pokemon world  an infj society  everyone becomes an optimist|||49142|||http://www.youtube.com/watch?v=ZRCEq_JFeFM|||http://discovermagazine.com/2012/jul-aug/20-things-you-didnt-know-about-deserts/desert.jpg|||http://oyster.ignimgs.com/mediawiki/apis.ign.com/pokemon-silver-version/d/dd/Ditto.gif|||http://www.serebii.net/potw-dp/Scizor.jpg|||Not all artists are artists because they draw. It's the idea that counts in forming something of your own... like a signature.|||Welcome to the robot ranks, person who downed my self-esteem cuz I'm not an avid signature artist like herself. :proud:|||Banned for taking all the room under my bed. Ya gotta learn to share with the roaches.|||http://www.youtube.com/watch?v=w8IgImn57aQ|||Banned for being too much of a thundering, grumbling kind of storm... yep.|||Ahh... old high school music I haven't heard in ages.   http://www.youtube.com/watch?v=dcCRUPCdB1w|||I failed a public speaking class a few years ago and I've sort of learned what I could do better were I to be in that position again. A big part of my failure was just overloading myself with too...|||I like this person's mentality. He's a confirmed INTJ by the way. http://www.youtube.com/watch?v=hGKLI-GEc6M|||Move to the Denver area and start a new life for myself.'"

From above it is quite evident that there are many links in the text data. Thata needed to be removed.¶

In [7]:
train_data,test_data=train_test_split(data,test_size=0.2,random_state=42,
                                     stratify=data.type)

Function to clean the text data.

In [8]:
def clear_text(data):
    data_length=[]
    lemmatizer=WordNetLemmatizer()
    cleaned_text=[]
    for sentence in tqdm(data.posts):
        sentence=sentence.lower()
        
        sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+','',sentence)
        sentence=re.sub('[^0-9a-z]',' ',sentence)
        
        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text,data_length
In [9]:
train_data.posts,train_length=clear_text(train_data)
test_data.posts,test_length=clear_text(test_data)
100%|████████████████████████████████████████████████████████████████████████████| 6940/6940 [00:03<00:00, 1939.03it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1735/1735 [00:00<00:00, 1906.13it/s]
In [62]:
train_data.posts[0]
Out[62]:
'  and intj moments    sportscenter not top ten plays    pranks   what has been the most life changing experience in your life         on repeat for most of today    may the perc experience immerse you    the last thing my infj friend posted on his facebook before committing suicide the next day  rest in peace     enfj7  sorry to hear of your distress  it s only natural for a relationship to not be perfection all the time in every moment of existence  try to figure the hard times as times of growth  as      84389  84390           welcome and stuff      game  set  match    prozac  wellbrutin  at least thirty minutes of moving your legs  and i don t mean moving them while sitting in your same desk chair   weed in moderation  maybe try edibles as a healthier alternative      basically come up with three items you ve determined that each type  or whichever types you want to do  would more than likely use  given each types  cognitive functions and whatnot  when left by      all things in moderation   sims is indeed a video game  and a good one at that  note  a good one at that is somewhat subjective in that i am not completely promoting the death of any given sim      dear enfp   what were your favorite video games growing up and what are your now  current favorite video games   cool     appears to be too late   sad    there s someone out there for everyone    wait    i thought confidence was a good thing    i just cherish the time of solitude b c i revel within my inner world more whereas most other time i d be workin    just enjoy the me time while you can  don t worry  people will always be around to      yo entp ladies    if you re into a complimentary personality well  hey        when your main social outlet is xbox live conversations and even then you verbally fatigue quickly      i really dig the part from 1 46 to 2 50    because this thread requires it of me    get high in backyard  roast and eat marshmellows in backyard while conversing over something intellectual  followed by massages and kisses     for too many b s in that sentence  how could you  think of the b    banned for watching movies in the corner with the dunces    banned because health class clearly taught you nothing about peer pressure    banned for a whole host of reasons     two baby deer on left and right munching on a beetle in the middle   2  using their own blood  two cavemen diary today s latest happenings on their designated cave diary wall   3  i see it as      a pokemon world  an infj society  everyone becomes an optimist   49142    all artists are artists because they draw  it s the idea that counts in forming something of your own    like a signature    welcome to the robot ranks  person who downed my self esteem cuz i m not an avid signature artist like herself   proud    banned for taking all the room under my bed  ya gotta learn to share with the roaches     for being too much of a thundering  grumbling kind of storm    yep    ahh    old high school music i haven t heard in ages     failed a public speaking class a few years ago and i ve sort of learned what i could do better were i to be in that position again  a big part of my failure was just overloading myself with too      i like this person s mentality  he s a confirmed intj by the way   to the denver area and start a new life for myself  '
In [10]:
plt.figure(figsize=(15,10))
sns.distplot(train_length,label='train data word length')
sns.distplot(test_length,label='test data word length')
plt.title('Number of words in text',fontdict={'size':20,
                                             'style':'italic'})
plt.show()
In [11]:
px.pie(train_data,names='type',title='Personality type',hole=0.3)

Tokenizing words¶

In [1]:
def Lemmatizer(sentence):
    for word in sentence.split():
        if len(word)>2:
            lemmatizer.lemmatize(word)
def Lemmatizer():
    lemmatizer=WordNetLemmatizer()
In [14]:
vectorizer=TfidfVectorizer(max_features=5000,stop_words='english',tokenizer=Lemmatizer())
vectorizer.fit(train_data.posts)
Out[14]:
TfidfVectorizer(max_features=5000, stop_words='english',
                tokenizer=<__main__.Lemmatizer object at 0x0000021C2F072940>)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
TfidfVectorizer(max_features=5000, stop_words='english',
                tokenizer=<__main__.Lemmatizer object at 0x0000021C2F072940>)
In [15]:
train_post=vectorizer.transform(train_data.posts).toarray()
test_post=vectorizer.transform(test_data.posts).toarray()
In [56]:
train_post[0]
Out[56]:
array([0.05223205, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ])
In [17]:
target_encoder=LabelEncoder()
train_target=target_encoder.fit_transform(train_data.type)
test_target=target_encoder.fit_transform(test_data.type)
In [82]:
target_encoder.classes_
Out[82]:
array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
       'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
      dtype=object)
In [78]:
train_data.type.unique()
Out[78]:
array(['INFP', 'ISTP', 'ENFJ', 'ENFP', 'INFJ', 'ESFP', 'INTJ', 'ISFP',
       'ISTJ', 'ENTP', 'ENTJ', 'INTP', 'ISFJ', 'ESTP', 'ESFJ', 'ESTJ'],
      dtype=object)
In [18]:
train_target
Out[18]:
array([ 9, 15,  0, ...,  2, 11,  3])
In [19]:
models_accuracy={}

Logistic Regression¶

In [20]:
model_log=LogisticRegression(max_iter=3000,C=0.5,n_jobs=-1)
model_log.fit(train_post,train_target)
Out[20]:
LogisticRegression(C=0.5, max_iter=3000, n_jobs=-1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.5, max_iter=3000, n_jobs=-1)
In [21]:
print('train classification report \n',classification_report(train_target,model_log.predict(train_post),
                                                            target_names=target_encoder.inverse_transform(
                                                            [i for i in range(16)])))
train classification report 
               precision    recall  f1-score   support

        ENFJ       0.86      0.16      0.27       152
        ENFP       0.80      0.65      0.72       540
        ENTJ       0.93      0.29      0.44       185
        ENTP       0.82      0.66      0.73       548
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        38
        ESTJ       0.00      0.00      0.00        31
        ESTP       1.00      0.04      0.08        71
        INFJ       0.74      0.83      0.78      1176
        INFP       0.66      0.93      0.77      1466
        INTJ       0.73      0.80      0.77       873
        INTP       0.69      0.87      0.77      1043
        ISFJ       0.89      0.24      0.38       133
        ISFP       0.86      0.25      0.39       217
        ISTJ       0.86      0.27      0.41       164
        ISTP       0.86      0.51      0.64       270

    accuracy                           0.72      6940
   macro avg       0.67      0.41      0.45      6940
weighted avg       0.74      0.72      0.69      6940

In [22]:
print('test classification report \n',
     classification_report(test_target,model_log.predict(test_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
test classification report 
               precision    recall  f1-score   support

        ENFJ       1.00      0.08      0.15        38
        ENFP       0.76      0.53      0.62       135
        ENTJ       0.75      0.13      0.22        46
        ENTP       0.66      0.51      0.58       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.00      0.00      0.00        18
        INFJ       0.64      0.71      0.67       294
        INFP       0.56      0.88      0.69       366
        INTJ       0.61      0.65      0.63       218
        INTP       0.67      0.84      0.74       261
        ISFJ       0.67      0.12      0.21        33
        ISFP       0.85      0.20      0.33        54
        ISTJ       0.60      0.07      0.13        41
        ISTP       0.71      0.45      0.55        67

    accuracy                           0.63      1735
   macro avg       0.53      0.32      0.34      1735
weighted avg       0.64      0.63      0.59      1735

In [23]:
models_accuracy['logistic regression']=accuracy_score(test_target,
                                                     model_log.predict(test_post))

Linear Support Vector Classifier¶

In [24]:
model_linear_svc=LinearSVC(C=0.1)
model_linear_svc.fit(train_post,train_target)
Out[24]:
LinearSVC(C=0.1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearSVC(C=0.1)
In [25]:
print('train classification report\n',
     classification_report(train_target,model_linear_svc.predict(train_post),
                          target_names=target_encoder.inverse_transform([i for i in range(16)])))
print('test classification report\n',
     classification_report(test_target,model_linear_svc.predict(test_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
train classification report
               precision    recall  f1-score   support

        ENFJ       0.89      0.46      0.61       152
        ENFP       0.85      0.76      0.80       540
        ENTJ       0.92      0.64      0.76       185
        ENTP       0.84      0.82      0.83       548
        ESFJ       0.91      0.30      0.45        33
        ESFP       1.00      0.13      0.23        38
        ESTJ       1.00      0.26      0.41        31
        ESTP       0.92      0.48      0.63        71
        INFJ       0.82      0.86      0.84      1176
        INFP       0.77      0.93      0.84      1466
        INTJ       0.83      0.85      0.84       873
        INTP       0.81      0.90      0.85      1043
        ISFJ       0.92      0.67      0.77       133
        ISFP       0.89      0.58      0.70       217
        ISTJ       0.88      0.66      0.76       164
        ISTP       0.90      0.82      0.86       270

    accuracy                           0.82      6940
   macro avg       0.88      0.63      0.70      6940
weighted avg       0.83      0.82      0.82      6940

test classification report
               precision    recall  f1-score   support

        ENFJ       0.58      0.18      0.28        38
        ENFP       0.75      0.59      0.66       135
        ENTJ       0.61      0.30      0.41        46
        ENTP       0.62      0.55      0.58       137
        ESFJ       1.00      0.33      0.50         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       1.00      0.12      0.22         8
        ESTP       0.67      0.33      0.44        18
        INFJ       0.68      0.72      0.70       294
        INFP       0.62      0.86      0.72       366
        INTJ       0.64      0.66      0.65       218
        INTP       0.71      0.82      0.76       261
        ISFJ       0.59      0.30      0.40        33
        ISFP       0.82      0.33      0.47        54
        ISTJ       0.81      0.32      0.46        41
        ISTP       0.68      0.57      0.62        67

    accuracy                           0.66      1735
   macro avg       0.67      0.44      0.49      1735
weighted avg       0.67      0.66      0.65      1735

In [26]:
models_accuracy['Linear Support Vector Classifier']=accuracy_score(
test_target,model_linear_svc.predict(test_post))

Support Vector Classifier¶

In [27]:
model_svc=SVC()
model_svc.fit(train_post,train_target)
Out[27]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [28]:
print('train classification report\n',
     classification_report(train_target,model_svc.predict(train_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
print('test classification report\n',
     classification_report(test_target,model_svc.predict(test_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
train classification report
               precision    recall  f1-score   support

        ENFJ       0.97      0.86      0.91       152
        ENFP       0.96      0.95      0.95       540
        ENTJ       0.99      0.90      0.94       185
        ENTP       0.95      0.96      0.95       548
        ESFJ       1.00      0.58      0.73        33
        ESFP       1.00      0.37      0.54        38
        ESTJ       1.00      0.52      0.68        31
        ESTP       1.00      0.82      0.90        71
        INFJ       0.95      0.97      0.96      1176
        INFP       0.93      0.98      0.95      1466
        INTJ       0.96      0.96      0.96       873
        INTP       0.94      0.97      0.96      1043
        ISFJ       1.00      0.89      0.94       133
        ISFP       0.97      0.90      0.94       217
        ISTJ       0.94      0.92      0.93       164
        ISTP       0.97      0.94      0.95       270

    accuracy                           0.95      6940
   macro avg       0.97      0.84      0.89      6940
weighted avg       0.95      0.95      0.95      6940

test classification report
               precision    recall  f1-score   support

        ENFJ       0.62      0.26      0.37        38
        ENFP       0.76      0.56      0.65       135
        ENTJ       0.65      0.28      0.39        46
        ENTP       0.64      0.53      0.58       137
        ESFJ       0.33      0.11      0.17         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.71      0.28      0.40        18
        INFJ       0.66      0.69      0.67       294
        INFP       0.59      0.86      0.70       366
        INTJ       0.64      0.63      0.64       218
        INTP       0.66      0.83      0.73       261
        ISFJ       0.82      0.27      0.41        33
        ISFP       0.79      0.35      0.49        54
        ISTJ       0.79      0.27      0.40        41
        ISTP       0.75      0.57      0.64        67

    accuracy                           0.65      1735
   macro avg       0.59      0.41      0.45      1735
weighted avg       0.65      0.65      0.63      1735

In [29]:
models_accuracy['Support Vector Classifier']=accuracy_score(test_target,
                                                           model_svc.predict(test_post))

Multinomial Naive Bayes¶

In [30]:
model_multinomial_nb=MultinomialNB()
model_multinomial_nb.fit(train_post,train_target)
Out[30]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
In [31]:
print('Train Classification Report\n',
     classification_report(train_target,
    model_multinomial_nb.predict(train_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))

print('Test classification report\n',
     classification_report(train_target,model_multinomial_nb.predict(train_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
Train Classification Report
               precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00       152
        ENFP       0.89      0.01      0.03       540
        ENTJ       0.00      0.00      0.00       185
        ENTP       0.91      0.05      0.10       548
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        38
        ESTJ       0.00      0.00      0.00        31
        ESTP       0.00      0.00      0.00        71
        INFJ       0.52      0.62      0.56      1176
        INFP       0.35      0.94      0.51      1466
        INTJ       0.78      0.42      0.54       873
        INTP       0.58      0.63      0.61      1043
        ISFJ       0.00      0.00      0.00       133
        ISFP       0.00      0.00      0.00       217
        ISTJ       0.00      0.00      0.00       164
        ISTP       0.00      0.00      0.00       270

    accuracy                           0.46      6940
   macro avg       0.25      0.17      0.15      6940
weighted avg       0.49      0.46      0.37      6940

Test classification report
               precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00       152
        ENFP       0.89      0.01      0.03       540
        ENTJ       0.00      0.00      0.00       185
        ENTP       0.91      0.05      0.10       548
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        38
        ESTJ       0.00      0.00      0.00        31
        ESTP       0.00      0.00      0.00        71
        INFJ       0.52      0.62      0.56      1176
        INFP       0.35      0.94      0.51      1466
        INTJ       0.78      0.42      0.54       873
        INTP       0.58      0.63      0.61      1043
        ISFJ       0.00      0.00      0.00       133
        ISFP       0.00      0.00      0.00       217
        ISTJ       0.00      0.00      0.00       164
        ISTP       0.00      0.00      0.00       270

    accuracy                           0.46      6940
   macro avg       0.25      0.17      0.15      6940
weighted avg       0.49      0.46      0.37      6940

In [32]:
models_accuracy['Multinomial Naive Bayes']=accuracy_score(test_target,
                                            model_multinomial_nb.predict(test_post))

Decision Tree Classifier¶

In [33]:
model_tree=DecisionTreeClassifier(max_depth=14)
model_tree.fit(train_post,train_target)
Out[33]:
DecisionTreeClassifier(max_depth=14)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(max_depth=14)
In [34]:
print('Train Classification report\n',
     classification_report(train_target,model_tree.predict(train_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
print('Test Classification report\n',
     classification_report(test_target,model_tree.predict(test_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
Train Classification report
               precision    recall  f1-score   support

        ENFJ       0.77      0.57      0.65       152
        ENFP       0.87      0.81      0.84       540
        ENTJ       0.86      0.67      0.75       185
        ENTP       0.94      0.76      0.84       548
        ESFJ       1.00      0.39      0.57        33
        ESFP       0.91      0.26      0.41        38
        ESTJ       0.85      0.35      0.50        31
        ESTP       0.83      0.42      0.56        71
        INFJ       0.80      0.86      0.83      1176
        INFP       0.63      0.93      0.75      1466
        INTJ       0.87      0.77      0.82       873
        INTP       0.88      0.82      0.85      1043
        ISFJ       0.98      0.40      0.57       133
        ISFP       0.97      0.64      0.77       217
        ISTJ       0.96      0.55      0.70       164
        ISTP       0.98      0.69      0.81       270

    accuracy                           0.79      6940
   macro avg       0.88      0.62      0.70      6940
weighted avg       0.82      0.79      0.79      6940

Test Classification report
               precision    recall  f1-score   support

        ENFJ       0.15      0.11      0.12        38
        ENFP       0.51      0.52      0.51       135
        ENTJ       0.25      0.22      0.23        46
        ENTP       0.51      0.41      0.46       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.09      0.06      0.07        18
        INFJ       0.55      0.62      0.58       294
        INFP       0.48      0.67      0.56       366
        INTJ       0.53      0.49      0.51       218
        INTP       0.62      0.56      0.59       261
        ISFJ       0.12      0.06      0.08        33
        ISFP       0.54      0.35      0.43        54
        ISTJ       0.45      0.24      0.32        41
        ISTP       0.56      0.43      0.49        67

    accuracy                           0.51      1735
   macro avg       0.34      0.30      0.31      1735
weighted avg       0.50      0.51      0.50      1735

Random Forest Classifier¶

In [35]:
model_forest=RandomForestClassifier(max_depth=10)
model_forest.fit(train_post,train_target)
Out[35]:
RandomForestClassifier(max_depth=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=10)
In [36]:
print('Train Classification report\n',
     classification_report(train_target,model_forest.predict(train_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
print('Test Classification report\n',
     classification_report(test_target,model_forest.predict(test_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
Train Classification report
               precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00       152
        ENFP       1.00      0.26      0.41       540
        ENTJ       1.00      0.06      0.11       185
        ENTP       1.00      0.47      0.64       548
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        38
        ESTJ       0.00      0.00      0.00        31
        ESTP       1.00      0.01      0.03        71
        INFJ       0.84      0.82      0.83      1176
        INFP       0.43      1.00      0.60      1466
        INTJ       0.86      0.77      0.81       873
        INTP       0.81      0.88      0.84      1043
        ISFJ       1.00      0.04      0.07       133
        ISFP       1.00      0.04      0.08       217
        ISTJ       1.00      0.02      0.05       164
        ISTP       1.00      0.16      0.28       270

    accuracy                           0.65      6940
   macro avg       0.68      0.28      0.30      6940
weighted avg       0.77      0.65      0.60      6940

Test Classification report
               precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.80      0.03      0.06       135
        ENTJ       0.00      0.00      0.00        46
        ENTP       0.83      0.11      0.19       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.00      0.00      0.00        18
        INFJ       0.60      0.49      0.54       294
        INFP       0.33      0.95      0.49       366
        INTJ       0.63      0.47      0.54       218
        INTP       0.59      0.57      0.58       261
        ISFJ       0.00      0.00      0.00        33
        ISFP       0.00      0.00      0.00        54
        ISTJ       0.00      0.00      0.00        41
        ISTP       0.00      0.00      0.00        67

    accuracy                           0.44      1735
   macro avg       0.24      0.16      0.15      1735
weighted avg       0.47      0.44      0.37      1735

In [37]:
models_accuracy['Random Forest Classifier']=accuracy_score(test_target,
                                                          model_forest.predict(test_post))

XGBoost Classifier¶

In [45]:
model_xgb=XGBClassifier(gpu_id=0,
                       tree_method='gpu_hist',
                       max_depth=5,n_estimators=50,
                       learning_rate=0.1)
model_xgb.fit(train_post,train_target)
Out[45]:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, ...)
In [46]:
print('Train Classification report\n',
     classification_report(train_target,model_xgb.predict(train_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
print('Test Classification report\n',
     classification_report(test_target,model_xgb.predict(test_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
Train Classification report
               precision    recall  f1-score   support

        ENFJ       0.99      0.93      0.96       152
        ENFP       0.94      0.91      0.92       540
        ENTJ       0.99      0.91      0.95       185
        ENTP       0.94      0.91      0.92       548
        ESFJ       1.00      0.85      0.92        33
        ESFP       1.00      0.92      0.96        38
        ESTJ       1.00      0.84      0.91        31
        ESTP       1.00      0.94      0.97        71
        INFJ       0.91      0.90      0.91      1176
        INFP       0.89      0.95      0.92      1466
        INTJ       0.92      0.92      0.92       873
        INTP       0.90      0.92      0.91      1043
        ISFJ       1.00      0.95      0.98       133
        ISFP       0.98      0.91      0.94       217
        ISTJ       0.99      0.93      0.96       164
        ISTP       0.97      0.97      0.97       270

    accuracy                           0.92      6940
   macro avg       0.96      0.92      0.94      6940
weighted avg       0.92      0.92      0.92      6940

Test Classification report
               precision    recall  f1-score   support

        ENFJ       0.68      0.39      0.50        38
        ENFP       0.71      0.64      0.67       135
        ENTJ       0.65      0.37      0.47        46
        ENTP       0.58      0.58      0.58       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       1.00      0.10      0.18        10
        ESTJ       1.00      0.12      0.22         8
        ESTP       0.50      0.28      0.36        18
        INFJ       0.68      0.76      0.72       294
        INFP       0.67      0.81      0.73       366
        INTJ       0.69      0.65      0.67       218
        INTP       0.70      0.78      0.74       261
        ISFJ       0.67      0.55      0.60        33
        ISFP       0.65      0.41      0.50        54
        ISTJ       0.71      0.41      0.52        41
        ISTP       0.68      0.64      0.66        67

    accuracy                           0.67      1735
   macro avg       0.66      0.47      0.51      1735
weighted avg       0.67      0.67      0.66      1735

In [47]:
models_accuracy['XGBoost Classifier']=accuracy_score(test_target,model_xgb.predict(test_post))

CatBoost Classifier¶

In [48]:
model_cat=CatBoostClassifier(loss_function='MultiClass',eval_metric='MultiClass',
                            task_type='GPU',verbose=False)
model_cat.fit(train_post,train_target)
Warning: less than 75% gpu memory available for training. Free: 2739.700001 Total: 4095.6875
Out[48]:
<catboost.core.CatBoostClassifier at 0x21c404bb9a0>
In [49]:
print('Train Classification report\n',
     classification_report(train_target,model_cat.predict(train_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
print('Test Classification report\n',
     classification_report(test_target,model_cat.predict(test_post),
                          target_names=target_encoder.inverse_transform(
                          [i for i in range(16)])))
Train Classification report
               precision    recall  f1-score   support

        ENFJ       0.88      0.62      0.73       152
        ENFP       0.84      0.79      0.81       540
        ENTJ       0.89      0.66      0.76       185
        ENTP       0.82      0.80      0.81       548
        ESFJ       0.94      0.52      0.67        33
        ESFP       1.00      0.34      0.51        38
        ESTJ       1.00      0.39      0.56        31
        ESTP       0.92      0.63      0.75        71
        INFJ       0.82      0.86      0.84      1176
        INFP       0.80      0.90      0.85      1466
        INTJ       0.82      0.82      0.82       873
        INTP       0.79      0.88      0.83      1043
        ISFJ       0.92      0.70      0.79       133
        ISFP       0.86      0.67      0.76       217
        ISTJ       0.90      0.73      0.81       164
        ISTP       0.88      0.80      0.83       270

    accuracy                           0.82      6940
   macro avg       0.88      0.69      0.76      6940
weighted avg       0.83      0.82      0.82      6940

Test Classification report
               precision    recall  f1-score   support

        ENFJ       0.64      0.37      0.47        38
        ENFP       0.71      0.65      0.68       135
        ENTJ       0.80      0.43      0.56        46
        ENTP       0.60      0.59      0.59       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       1.00      0.25      0.40         8
        ESTP       0.83      0.56      0.67        18
        INFJ       0.70      0.73      0.71       294
        INFP       0.67      0.79      0.72       366
        INTJ       0.65      0.67      0.66       218
        INTP       0.68      0.77      0.72       261
        ISFJ       0.70      0.58      0.63        33
        ISFP       0.61      0.43      0.50        54
        ISTJ       0.80      0.39      0.52        41
        ISTP       0.68      0.69      0.68        67

    accuracy                           0.67      1735
   macro avg       0.63      0.49      0.53      1735
weighted avg       0.67      0.67      0.66      1735

In [50]:
models_accuracy['CatBoost Classifier']=accuracy_score(test_target,
                                                     model_cat.predict(test_post))
In [51]:
models_accuracy
Out[51]:
{'logistic regression': 0.6282420749279539,
 'Linear Support Vector Classifier': 0.6628242074927954,
 'Support Vector Classifier': 0.6478386167146974,
 'Multinomial Naive Bayes': 0.37809798270893374,
 'Random Forest Classifier': 0.4386167146974063,
 'XGBoost Classifier': 0.6743515850144092,
 'CatBoost Classifier': 0.6731988472622479}
In [67]:
def clear_text1(data):
    data_length=[]
    lemmatizer=WordNetLemmatizer()
    cleaned_text=[]
    for sentence in tqdm(data):
        sentence=sentence.lower()
        
        sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+','',sentence)
        sentence=re.sub('[^0-9a-z]',' ',sentence)
        
        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text
In [70]:
text_data
Out[70]:
['hi  i am very good boy staying alone being happpy ']
In [107]:
text=["Hi, I am very good boy staying alone being happpy. i like to stay happy making friend enjoying party"]
text_data=clear_text1(text)
text_v=vectorizer.transform(text_data).toarray()
100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
In [108]:
text_v
Out[108]:
array([[0., 0., 0., ..., 0., 0., 0.]])
In [109]:
a=model_cat.predict(text_v)
In [110]:
a[0][0]
Out[110]:
9
In [111]:
a
Out[111]:
array([[9]], dtype=int64)
In [112]:
target_encoder.classes_[a[0][0]]
Out[112]:
'INFP'
In [ ]: